%load_ext autoreload
%autoreload 2Plotting for results
This notebook produces all results plots. It generates some gap in the data, fill with a method (filter, MDS …), compute metrics and then makes all relevant plots
import altair as altfrom meteo_imp.kalman.results import *
from meteo_imp.data import *
from meteo_imp.utils import *
import pandas as pd
import numpy as np
from pyprojroot import here
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from IPython.display import SVG, Image
from meteo_imp.kalman.results import _plot_timeseries, _get_labels
from functools import partial
from contextlib import redirect_stderr
import io
import polars as pl
from fastai.vision.data import get_grid
import cairosvgthe generation of a proper pdf is complex as altair_render doesn’t support XOffset, so plotsare first renderder to svg using vl-convert and then to pdf using cairosvg. However this last methods doesn’t support negative numbers …
Due to the high number of samples also cannot use the browser render in the notebook so using vl-convert to a png for the visualization in the notebook
import vl_convert as vlc
from pyprojroot import here
base_path_img = here("manuscript/Master Thesis - Evaluation of Kalman filter for meteorological time series imputation for Eddy Covariance applications - Simone Massaro/images/")
base_path_tbl = here("manuscript/Master Thesis - Evaluation of Kalman filter for meteorological time series imputation for Eddy Covariance applications - Simone Massaro/tables/")
base_path_img.mkdir(exist_ok=True), base_path_tbl.mkdir(exist_ok=True)
def save_show_plot(plot,
path,
altair_render=False # use altair render for pdf?
):
plt_json = plot.to_json()
if not altair_render:
svg_data = vlc.vegalite_to_svg(vl_spec=plt_json)
with open(base_path_img / (path + ".svg"), 'w') as f:
f.write(svg_data)
cairosvg.svg2pdf(file_obj=open(base_path_img / (path + ".svg")), write_to=str(base_path_img / (path + ".pdf")))
else:
with redirect_stderr(io.StringIO()):
plot.save(base_path_img / (path + ".pdf"))
# render to image for displaying in notebook
png_data = vlc.vegalite_to_png(vl_spec=plot.to_json(), scale=1)
return Image(png_data)reset_seed()
n_rep = 500hai = pd.read_parquet(hai_big_path).reindex(columns=var_type.categories)
hai_era = pd.read_parquet(hai_era_big_path)alt.data_transformers.disable_max_rows() # it is safe to do so as the plots are rendered using vl-convert and then showed as imagesDataTransformerRegistry.enable('default')
Correlation
import matplotlib.pyplot as pltimport statsmodels.api as smdef auto_corr_df(data, nlags=96):
autocorr = {}
for col in data.columns:
autocorr[col] = sm.tsa.acf(data[col], nlags=nlags)
return pd.DataFrame(autocorr)auto_corr = auto_corr_df(hai).reset_index(names="gap_len").melt(id_vars="gap_len")
auto_corr.gap_len = auto_corr.gap_len / 2auto_corr| gap_len | variable | value | |
|---|---|---|---|
| 0 | 0.0 | TA | 1.000000 |
| 1 | 0.5 | TA | 0.998595 |
| 2 | 1.0 | TA | 0.995814 |
| 3 | 1.5 | TA | 0.992141 |
| 4 | 2.0 | TA | 0.987630 |
| ... | ... | ... | ... |
| 868 | 46.0 | TS | 0.959680 |
| 869 | 46.5 | TS | 0.961116 |
| 870 | 47.0 | TS | 0.962085 |
| 871 | 47.5 | TS | 0.962551 |
| 872 | 48.0 | TS | 0.962480 |
873 rows × 3 columns
p = (alt.Chart(auto_corr).mark_line().encode(
x = alt.X('gap_len', title="Gap length [h]", axis = alt.Axis(values= [12, 24, 36, 48])),
y = alt.Y("value", title="correlation"),
color=alt.Color("variable", scale=meteo_scale, title="Variable"),
facet =alt.Facet('variable', columns=3, sort = meteo_scale.domain, title=None,
header = alt.Header(labelFontWeight="bold", labelFontSize=20))
)
.properties(height=120, width=250)
.resolve_scale(y='independent', x = 'independent')
.pipe(plot_formatter))
save_show_plot(p, "temporal_autocorrelation")
axes = get_grid(1,1,1, figsize=(10,8))
sns.set(font_scale=1.25)
sns.heatmap(hai.corr(), annot=True, vmin=-1, vmax=1, center=0,
cmap=sns.diverging_palette(20, 220, n=200), ax=axes[0], square=True, cbar=True)
# axes[0].set(xlabel="Variable", ylabel="Variable", title="Inter-variable Correlation");
# size_old = plt.rcParams["axes.labelsize"]
# w_old = plt.rcParams["axes.labelweight"]
# plt.rcParams["axes.labelsize"] = 30
# plt.rcParams["axes.labelweight"] = 'bold'
plt.tight_layout()
plt.xticks(weight = 'bold')
plt.yticks(weight = 'bold')
with matplotlib.rc_context({"axes.labelsize": 30}):
plt.savefig(base_path_img / "correlation.pdf")
plt.show()
# plt.rcParams["axes.labelsize"] = size_old
# plt.rcParams["axes.labelweight"] = w_old
Comparison Imputation methods
base_path = here("analysis/results/trained_models")def l_model(x, base_path=base_path): return torch.load(base_path / x)models_var = pd.DataFrame.from_records([
{'var': 'TA', 'model': l_model("TA_specialized_gap_6-336_v3_0.pickle",base_path)},
{'var': 'SW_IN', 'model': l_model("SW_IN_specialized_gap_6-336_v2_0.pickle",base_path)},
{'var': 'LW_IN', 'model': l_model("LW_IN_specialized_gap_6-336_v1.pickle",base_path)},
{'var': 'VPD', 'model': l_model("VPD_specialized_gap_6-336_v2_0.pickle",base_path)},
{'var': 'WS', 'model': l_model("WS_specialized_gap_6-336_v1.pickle",base_path)},
{'var': 'PA', 'model': l_model("PA_specialized_gap_6-336_v3_0.pickle",base_path)},
{'var': 'P', 'model': l_model("1_gap_varying_6-336_v3.pickle",base_path)},
{'var': 'TS', 'model': l_model("TS_specialized_gap_6-336_v2_0.pickle",base_path)},
{'var': 'SWC', 'model': l_model("SWC_specialized_gap_6-336_v2_1.pickle",base_path)},
])@cache_disk(cache_dir / "the_results")
def get_the_results(n_rep=20):
reset_seed()
comp_Av = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446, time_series=False)
results_Av = comp_Av.compare(gap_len = [12,24, 48, 336], var=list(hai.columns), n_rep=n_rep)
return results_Av
results_Av = get_the_results(n_rep)State of the art
the first plot is a time series using only state-of-the-art methods
reset_seed()
comp_ts = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 48+100, time_series=True, rmse=False)
results_ts = comp_ts.compare(gap_len = [48], var=list(hai.columns), n_rep=1) res_ts = results_ts.query("method != 'Kalman Filter'")
res_ts_plot = pd.concat([unnest_predictions(row, ctx_len=72) for _,row in res_ts.iterrows()])scale_sota = alt.Scale(domain=["ERA-I", "MDS"], range=list(sns.color_palette('Dark2', 3).as_hex())[1:])p = (facet_wrap(res_ts_plot, partial(_plot_timeseries, scale_color=scale_sota, err_band = False), col="var",
y_labels = _get_labels(res_ts_plot, 'mean', None),
)
.pipe(plot_formatter)
)
save_show_plot(p, "timeseries_sota", altair_render=True)
Percentage improvement
results_Av.method.unique()['Kalman Filter', 'ERA-I', 'MDS']
Categories (3, object): ['Kalman Filter' < 'ERA-I' < 'MDS']
all_res = results_Av.query('var != "P"').groupby(['method']).agg({'rmse_stand': 'mean'}).Tall_res| method | Kalman Filter | ERA-I | MDS |
|---|---|---|---|
| rmse_stand | 0.204628 | 0.307361 | 0.482837 |
percentage of improvement across all variables
(all_res["ERA-I"] - all_res["Kalman Filter"]) / all_res["ERA-I"] * 100 rmse_stand 33.42398
dtype: float64
(all_res["MDS"] - all_res["Kalman Filter"]) / all_res["MDS"] * 100 rmse_stand 57.619542
dtype: float64
res_var = results_Av.groupby(['method', 'var']).agg({'rmse_stand': 'mean'}) res_var = res_var.reset_index().pivot(columns='method', values='rmse_stand', index='var')pd.DataFrame({'ERA': (res_var["ERA-I"] - res_var["Kalman Filter"]) / res_var["ERA-I"] * 100, 'MDS': (res_var["MDS"] - res_var["Kalman Filter"]) / res_var["MDS"] * 100 })| ERA | MDS | |
|---|---|---|
| var | ||
| TA | 54.540802 | 77.713711 |
| SW_IN | 12.004508 | 35.516142 |
| LW_IN | 5.166063 | 52.289627 |
| VPD | 44.402821 | 65.407769 |
| WS | 21.064305 | 40.321732 |
| PA | 28.784191 | 90.751559 |
| P | -18.544370 | -22.084360 |
| SWC | NaN | 41.543006 |
| TS | NaN | 25.772326 |
res_var2 = results_Av.groupby(['method', 'var', 'gap_len']).agg({'rmse_stand': 'mean'}) res_var2 = res_var2.reset_index().pivot(columns='method', values='rmse_stand', index=['var', 'gap_len'])pd.DataFrame({'ERA': (res_var2["ERA-I"] - res_var2["Kalman Filter"]) / res_var2["ERA-I"] * 100, 'MDS': (res_var2["MDS"] - res_var2["Kalman Filter"]) / res_var2["MDS"] * 100 })| ERA | MDS | ||
|---|---|---|---|
| var | gap_len | ||
| TA | 6 | 69.897582 | 85.052698 |
| 12 | 58.766166 | 79.376385 | |
| 24 | 51.538443 | 75.395970 | |
| 168 | 41.823614 | 73.000401 | |
| SW_IN | 6 | 9.519984 | 29.746651 |
| 12 | 11.165399 | 30.639223 | |
| 24 | 14.232051 | 34.811941 | |
| 168 | 12.305658 | 42.651906 | |
| LW_IN | 6 | 21.023524 | 59.136518 |
| 12 | 9.110040 | 52.211404 | |
| 24 | -3.553292 | 50.720632 | |
| 168 | -4.260023 | 48.223005 | |
| VPD | 6 | 66.980942 | 79.449579 |
| 12 | 47.785633 | 69.081018 | |
| 24 | 33.663749 | 56.728120 | |
| 168 | 32.272332 | 57.702579 | |
| WS | 6 | 32.402977 | 45.724043 |
| 12 | 25.209162 | 43.275430 | |
| 24 | 15.543672 | 37.142502 | |
| 168 | 12.735569 | 36.436106 | |
| PA | 6 | 39.823585 | 91.511486 |
| 12 | 30.995845 | 90.532461 | |
| 24 | 24.727301 | 89.319180 | |
| 168 | 20.691181 | 91.421434 | |
| P | 6 | -18.485009 | -13.917879 |
| 12 | -28.935358 | -37.127331 | |
| 24 | -24.423076 | -29.998707 | |
| 168 | -7.725322 | -11.587796 | |
| SWC | 6 | NaN | 61.302664 |
| 12 | NaN | 47.976950 | |
| 24 | NaN | 42.535719 | |
| 168 | NaN | 23.301469 | |
| TS | 6 | NaN | 64.264901 |
| 12 | NaN | 46.699870 | |
| 24 | NaN | 27.050291 | |
| 168 | NaN | -15.268479 |
Main plot
from itertools import product
import altair as altp = the_plot(results_Av)
save_show_plot(p, "the_plot")
p = the_plot_stand(results_Av)
save_show_plot(p, "the_plot_stand")
Table
t = the_table(results_Av)
the_table_latex(t, base_path_tbl / "the_table.tex", label="tbl:the_table",
caption="\\CapTheTable")
t| Kalman Filter | ERA-I | MDS | |||||
|---|---|---|---|---|---|---|---|
| RMSE | mean | std | mean | std | mean | std | |
| Variable | Gap | ||||||
| TA | 6 h | 0.405453 | 0.258301 | 1.346910 | 0.997843 | 2.712546 | 1.896914 |
| 12 h | 0.606836 | 0.400849 | 1.471695 | 0.900611 | 2.942435 | 1.748131 | |
| 1 day (24 h) | 0.741275 | 0.368468 | 1.529614 | 0.800256 | 3.012819 | 1.611311 | |
| 1 week (168 h) | 1.020608 | 0.444591 | 1.754334 | 0.643160 | 3.780087 | 1.315472 | |
| SW_IN | 6 h | 44.636609 | 40.464629 | 49.333113 | 66.241975 | 63.536627 | 85.401585 |
| 12 h | 48.155186 | 33.868178 | 54.207691 | 49.769296 | 69.427115 | 68.936352 | |
| 1 day (24 h) | 56.564277 | 30.042752 | 65.950367 | 40.930505 | 86.770917 | 59.603564 | |
| 1 week (168 h) | 61.582820 | 25.740161 | 70.224393 | 34.883199 | 107.384249 | 53.606111 | |
| LW_IN | 6 h | 10.902409 | 7.736087 | 13.804628 | 12.987987 | 26.680077 | 15.022366 |
| 12 h | 13.421656 | 7.734502 | 14.766929 | 12.584725 | 28.085478 | 13.457335 | |
| 1 day (24 h) | 14.593819 | 7.840046 | 14.093052 | 12.227900 | 29.614461 | 12.416763 | |
| 1 week (168 h) | 17.062880 | 6.425136 | 16.365697 | 11.129569 | 32.954558 | 8.833972 | |
| VPD | 6 h | 0.428187 | 0.363168 | 1.296787 | 1.547397 | 2.083592 | 2.149288 |
| 12 h | 0.660623 | 0.504761 | 1.265213 | 1.288794 | 2.136626 | 2.095549 | |
| 1 day (24 h) | 0.827563 | 0.501975 | 1.247527 | 1.032319 | 1.912472 | 1.605013 | |
| 1 week (168 h) | 1.125680 | 0.633392 | 1.662069 | 1.127314 | 2.661345 | 1.965431 | |
| WS | 6 h | 0.616774 | 0.316972 | 0.912428 | 0.508295 | 1.136367 | 0.783146 |
| 12 h | 0.715412 | 0.350974 | 0.956550 | 0.524247 | 1.261203 | 0.796744 | |
| 1 day (24 h) | 0.801851 | 0.343378 | 0.949427 | 0.446912 | 1.275665 | 0.608630 | |
| 1 week (168 h) | 0.950211 | 0.363124 | 1.088887 | 0.348541 | 1.494891 | 0.615371 | |
| PA | 6 h | 0.045046 | 0.034294 | 0.074856 | 0.061726 | 0.530665 | 0.441476 |
| 12 h | 0.053359 | 0.041613 | 0.077328 | 0.058476 | 0.563603 | 0.427426 | |
| 1 day (24 h) | 0.059481 | 0.038666 | 0.079021 | 0.051491 | 0.556899 | 0.404451 | |
| 1 week (168 h) | 0.066325 | 0.047544 | 0.083628 | 0.053654 | 0.773143 | 0.384029 | |
| P | 6 h | 0.134093 | 0.274033 | 0.113173 | 0.315504 | 0.117710 | 0.305539 |
| 12 h | 0.178871 | 0.295419 | 0.138729 | 0.297227 | 0.130442 | 0.281377 | |
| 1 day (24 h) | 0.206231 | 0.253588 | 0.165750 | 0.288432 | 0.158641 | 0.265257 | |
| 1 week (168 h) | 0.239885 | 0.173820 | 0.222682 | 0.201782 | 0.214975 | 0.197499 | |
| SWC | 6 h | 0.508379 | 0.487342 | NaN | NaN | 1.313730 | 1.556829 |
| 12 h | 0.664855 | 0.471849 | NaN | NaN | 1.278001 | 1.323011 | |
| 1 day (24 h) | 0.779066 | 0.640996 | NaN | NaN | 1.355740 | 1.472185 | |
| 1 week (168 h) | 1.493784 | 0.947799 | NaN | NaN | 1.947605 | 1.488284 | |
| TS | 6 h | 0.341080 | 0.431992 | NaN | NaN | 0.954469 | 0.889126 |
| 12 h | 0.534363 | 0.783787 | NaN | NaN | 1.002555 | 0.876784 | |
| 1 day (24 h) | 0.786670 | 0.851931 | NaN | NaN | 1.078373 | 0.856964 | |
| 1 week (168 h) | 1.659875 | 1.077782 | NaN | NaN | 1.440008 | 0.764040 | |
t = the_table(results_Av, 'rmse_stand', y_name="Stand. RMSE")
the_table_latex(t, base_path_tbl / "the_table_stand.tex", stand = True, label="tbl:the_table_stand",
caption = "\\CapTheTableStand")
t| Kalman Filter | ERA-I | MDS | |||||
|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | mean | std | mean | std | |
| Variable | Gap | ||||||
| TA | 6 h | 0.051164 | 0.032595 | 0.169965 | 0.125917 | 0.342294 | 0.239370 |
| 12 h | 0.076576 | 0.050583 | 0.185712 | 0.113647 | 0.371303 | 0.220595 | |
| 1 day (24 h) | 0.093541 | 0.046497 | 0.193021 | 0.100984 | 0.380185 | 0.203330 | |
| 1 week (168 h) | 0.128790 | 0.056103 | 0.221378 | 0.081160 | 0.477006 | 0.165998 | |
| SW_IN | 6 h | 0.218804 | 0.198354 | 0.241826 | 0.324711 | 0.311450 | 0.418630 |
| 12 h | 0.236052 | 0.166018 | 0.265721 | 0.243964 | 0.340325 | 0.337919 | |
| 1 day (24 h) | 0.277272 | 0.147267 | 0.323282 | 0.200637 | 0.425342 | 0.292171 | |
| 1 week (168 h) | 0.301873 | 0.126176 | 0.344233 | 0.170994 | 0.526387 | 0.262772 | |
| LW_IN | 6 h | 0.259855 | 0.184387 | 0.329028 | 0.309564 | 0.635910 | 0.358053 |
| 12 h | 0.319900 | 0.184349 | 0.351964 | 0.299952 | 0.669407 | 0.320751 | |
| 1 day (24 h) | 0.347838 | 0.186865 | 0.335903 | 0.291448 | 0.705850 | 0.295949 | |
| 1 week (168 h) | 0.406688 | 0.153141 | 0.390071 | 0.265269 | 0.785460 | 0.210555 | |
| VPD | 6 h | 0.098019 | 0.083135 | 0.296855 | 0.354224 | 0.476967 | 0.492006 |
| 12 h | 0.151227 | 0.115548 | 0.289627 | 0.295025 | 0.489108 | 0.479704 | |
| 1 day (24 h) | 0.189442 | 0.114910 | 0.285579 | 0.236314 | 0.437795 | 0.367413 | |
| 1 week (168 h) | 0.257686 | 0.144994 | 0.380474 | 0.258060 | 0.609224 | 0.449918 | |
| WS | 6 h | 0.379454 | 0.195008 | 0.561347 | 0.312715 | 0.699120 | 0.481810 |
| 12 h | 0.440138 | 0.215927 | 0.588492 | 0.322529 | 0.775922 | 0.490176 | |
| 1 day (24 h) | 0.493318 | 0.211254 | 0.584110 | 0.274951 | 0.784819 | 0.374443 | |
| 1 week (168 h) | 0.584592 | 0.223403 | 0.669909 | 0.214431 | 0.919692 | 0.378591 | |
| PA | 6 h | 0.052675 | 0.040103 | 0.087534 | 0.072180 | 0.620545 | 0.516250 |
| 12 h | 0.062397 | 0.048661 | 0.090425 | 0.068381 | 0.659061 | 0.499820 | |
| 1 day (24 h) | 0.069556 | 0.045215 | 0.092405 | 0.060212 | 0.651223 | 0.472953 | |
| 1 week (168 h) | 0.077558 | 0.055597 | 0.097793 | 0.062741 | 0.904092 | 0.449073 | |
| P | 6 h | 0.478431 | 0.977725 | 0.403790 | 1.125691 | 0.419979 | 1.090136 |
| 12 h | 0.638197 | 1.054031 | 0.494974 | 1.060481 | 0.465404 | 1.003928 | |
| 1 day (24 h) | 0.735816 | 0.904779 | 0.591382 | 1.029100 | 0.566018 | 0.946414 | |
| 1 week (168 h) | 0.855891 | 0.620173 | 0.794512 | 0.719941 | 0.767011 | 0.704660 | |
| SWC | 6 h | 0.057037 | 0.054677 | NaN | NaN | 0.147393 | 0.174667 |
| 12 h | 0.074593 | 0.052939 | NaN | NaN | 0.143384 | 0.148434 | |
| 1 day (24 h) | 0.087407 | 0.071916 | NaN | NaN | 0.152106 | 0.165171 | |
| 1 week (168 h) | 0.167594 | 0.106338 | NaN | NaN | 0.218510 | 0.166977 | |
| TS | 6 h | 0.060276 | 0.076342 | NaN | NaN | 0.168674 | 0.157127 |
| 12 h | 0.094433 | 0.138512 | NaN | NaN | 0.177172 | 0.154946 | |
| 1 day (24 h) | 0.139021 | 0.150554 | NaN | NaN | 0.190571 | 0.151443 | |
| 1 week (168 h) | 0.293335 | 0.190466 | NaN | NaN | 0.254479 | 0.135022 | |
Timeseries
@cache_disk(cache_dir / "the_results_ts")
def get_the_results_ts():
reset_seed()
comp_Av = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446, time_series=True, rmse=False)
results_Av = comp_Av.compare(gap_len = [12,24, 336], var=list(hai.columns), n_rep=4)
return results_Av
results_ts = get_the_results_ts()ts = plot_timeseries(results_ts.query("var in ['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS']"), idx_rep=0)
save_show_plot(ts, "timeseries_1", altair_render=True)
%time ts = plot_timeseries(results_ts.query("var in ['PA', 'P', 'TS', 'SWC']"), idx_rep=0)
%time save_show_plot(ts, "timeseries_2", altair_render=True)CPU times: user 2.93 s, sys: 2.9 ms, total: 2.93 s
Wall time: 2.95 s
CPU times: user 8.44 s, sys: 74.3 ms, total: 8.52 s
Wall time: 12.3 s

from tqdm.auto import tqdm# @cache_disk(cache_dir / "ts_plot", rm_cache=True)
def plot_additional_ts():
for idx in tqdm(results_ts.idx_rep.unique()):
if idx == 0: continue # skip first plot as is done above
ts1 = plot_timeseries(results_ts.query("var in ['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS']"), idx_rep=idx)
save_show_plot(ts1, f"timeseries_1_{idx}", altair_render=True)
ts2 = plot_timeseries(results_ts.query("var in ['PA', 'P', 'TS', 'SWC']"), idx_rep=idx)
save_show_plot(ts2, f"timeseries_2_{idx}", altair_render=True) plot_additional_ts()Kalman Filter analysis
Gap len
@cache_disk("gap_len")
def get_g_len(n_rep=n_rep):
reset_seed()
return KalmanImpComparison(models_var, hai, hai_era, block_len=48*7+100).compare(gap_len = [2,6,12,24,48,48*2, 48*3, 48*7], var=list(hai.columns), n_rep=n_rep)gap_len = get_g_len(n_rep)p = plot_gap_len(gap_len, hai, hai_era)
save_show_plot(p, "gap_len")
t = table_gap_len(gap_len)
table_gap_len_latex(t, base_path_tbl / "gap_len.tex", label="gap_len",
caption="\\CapGapLen")
t| Gap | 1 h | 3 h | 6 h | 12 h | 1 day (24 h) | 2 days (48 h) | 3 days (72 h) | 1 week (168 h) | |
|---|---|---|---|---|---|---|---|---|---|
| Variable | Stand. RMSE | ||||||||
| TA | mean | 0.025327 | 0.034172 | 0.055041 | 0.077722 | 0.107172 | 0.115631 | 0.119447 | 0.124932 |
| std | 0.022582 | 0.022085 | 0.037678 | 0.048111 | 0.059004 | 0.055879 | 0.052048 | 0.042619 | |
| SW_IN | mean | 0.141258 | 0.185323 | 0.219323 | 0.242353 | 0.283377 | 0.279351 | 0.286491 | 0.295492 |
| std | 0.164966 | 0.180712 | 0.192019 | 0.162735 | 0.139278 | 0.130424 | 0.125494 | 0.120383 | |
| LW_IN | mean | 0.122415 | 0.199663 | 0.251175 | 0.355663 | 0.363577 | 0.401382 | 0.399580 | 0.426373 |
| std | 0.159722 | 0.171639 | 0.181869 | 0.226522 | 0.188607 | 0.195204 | 0.185700 | 0.164282 | |
| VPD | mean | 0.044495 | 0.072448 | 0.106393 | 0.185943 | 0.203379 | 0.226881 | 0.238030 | 0.258977 |
| std | 0.050343 | 0.092523 | 0.093396 | 0.176841 | 0.135613 | 0.141761 | 0.133105 | 0.148695 | |
| WS | mean | 0.236801 | 0.305489 | 0.371760 | 0.473200 | 0.505951 | 0.551268 | 0.530790 | 0.564939 |
| std | 0.198199 | 0.193005 | 0.202284 | 0.273506 | 0.285058 | 0.238593 | 0.200557 | 0.195587 | |
| PA | mean | 0.026844 | 0.038072 | 0.055131 | 0.063885 | 0.068426 | 0.075262 | 0.074562 | 0.080174 |
| std | 0.026776 | 0.029629 | 0.065971 | 0.039537 | 0.050700 | 0.058926 | 0.055454 | 0.059972 | |
| P | mean | 0.246141 | 0.550198 | 0.443231 | 0.722287 | 0.693768 | 0.729318 | 0.850642 | 0.907571 |
| std | 0.764003 | 1.821625 | 0.525170 | 1.351510 | 0.782852 | 0.619818 | 0.749813 | 0.583107 | |
| SWC | mean | 0.019637 | 0.038882 | 0.053718 | 0.067853 | 0.087583 | 0.098537 | 0.113108 | 0.165704 |
| std | 0.016406 | 0.034985 | 0.036017 | 0.042192 | 0.062350 | 0.072180 | 0.086248 | 0.099118 | |
| TS | mean | 0.025595 | 0.044131 | 0.067155 | 0.105208 | 0.129654 | 0.185770 | 0.209769 | 0.315963 |
| std | 0.026281 | 0.042837 | 0.070520 | 0.144570 | 0.118676 | 0.147786 | 0.139939 | 0.218048 |
g_len_agg = gap_len.groupby('gap_len').agg({'rmse_stand': 'mean'})
(g_len_agg.iloc[0])/g_len_agg.iloc[-1]rmse_stand 0.282954
dtype: float64
g_len_agg = gap_len.groupby(['gap_len', 'var']).agg({'rmse_stand': 'mean'})
(g_len_agg.loc[1.])/g_len_agg.loc[168.]| rmse_stand | |
|---|---|
| var | |
| TA | 0.202725 |
| SW_IN | 0.478043 |
| LW_IN | 0.287107 |
| VPD | 0.171809 |
| WS | 0.419162 |
| PA | 0.334820 |
| P | 0.271208 |
| SWC | 0.118507 |
| TS | 0.081006 |
g_len_agg| rmse_stand | ||
|---|---|---|
| gap_len | var | |
| 1 | TA | 0.025327 |
| SW_IN | 0.141258 | |
| LW_IN | 0.122415 | |
| VPD | 0.044495 | |
| WS | 0.236801 | |
| ... | ... | ... |
| 168 | WS | 0.564939 |
| PA | 0.080174 | |
| P | 0.907571 | |
| SWC | 0.165704 | |
| TS | 0.315963 |
72 rows × 1 columns
g_len_agg_std = gap_len.groupby('gap_len').agg({'rmse_stand': 'std'})
(g_len_agg_std.iloc[0])/g_len_agg_std.iloc[-1]rmse_stand 0.849176
dtype: float64
(gap_len.groupby(['gap_len', 'var']).agg({'rmse_stand': 'std'})
.unstack("var")
.droplevel(0, 1)
.plot(subplots=True, layout=(3,3), figsize=(10,10)))array([[<AxesSubplot: xlabel='gap_len'>, <AxesSubplot: xlabel='gap_len'>,
<AxesSubplot: xlabel='gap_len'>],
[<AxesSubplot: xlabel='gap_len'>, <AxesSubplot: xlabel='gap_len'>,
<AxesSubplot: xlabel='gap_len'>],
[<AxesSubplot: xlabel='gap_len'>, <AxesSubplot: xlabel='gap_len'>,
<AxesSubplot: xlabel='gap_len'>]], dtype=object)

# with open(base_path_tbl / "gap_len.tex") as f:
# print(f.readlines())Control
models_nc = pd.DataFrame({'model': [ l_model("1_gap_varying_336_no_control_v1.pickle"), l_model("1_gap_varying_6-336_v3.pickle")],
'type': [ 'No Control', 'Use Control' ]}) @cache_disk("use_control")
def get_control(n_rep=n_rep):
reset_seed()
kcomp_control = KalmanImpComparison(models_nc, hai, hai_era, block_len=100+48*7)
k_results_control = kcomp_control.compare(n_rep =n_rep, gap_len = [12, 24, 48, 48*7], var = list(hai.columns))
return k_results_controlfrom time import sleepk_results_control = get_control(n_rep)k_results_control| var | loss | likelihood | gap_len | idx_rep | type | rmse | rmse_stand | gap_len_f | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | TA | -5.742477 | 2.142377 | 6 | 0 | No Control | 0.227680 | 0.028731 | 6 h |
| 1 | TA | -5.755538 | 2.148783 | 6 | 1 | No Control | 0.228037 | 0.028776 | 6 h |
| 2 | TA | -1.232816 | 2.132641 | 6 | 2 | No Control | 1.813357 | 0.228826 | 6 h |
| 3 | TA | -4.693027 | 2.118098 | 6 | 3 | No Control | 0.906512 | 0.114392 | 6 h |
| 4 | TA | -5.179578 | 2.151546 | 6 | 4 | No Control | 0.726325 | 0.091654 | 6 h |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | TS | 115.193441 | 0.966260 | 168 | 495 | Use Control | 1.770984 | 0.312970 | 1 week (168 h) |
| 996 | TS | 82.913345 | 1.474181 | 168 | 496 | Use Control | 1.403469 | 0.248022 | 1 week (168 h) |
| 997 | TS | 93.694508 | 1.154413 | 168 | 497 | Use Control | 1.526679 | 0.269796 | 1 week (168 h) |
| 998 | TS | 118.712986 | 1.494637 | 168 | 498 | Use Control | 1.798757 | 0.317878 | 1 week (168 h) |
| 999 | TS | 93.694508 | 1.154413 | 168 | 499 | Use Control | 1.526679 | 0.269796 | 1 week (168 h) |
36000 rows × 9 columns
p = plot_compare(k_results_control, 'type', y = 'rmse', scale_domain=["Use Control", "No Control"])
save_show_plot(p, "use_control")
pfrom functools import partialt = table_compare(k_results_control, 'type')
table_compare_latex(t, base_path_tbl / "control.tex", label="tbl:control",
caption="\\CapControl")
t| type | Use Control | No Control | ||||||
|---|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | se | mean | std | se | diff. | |
| Variable | Gap | |||||||
| TA | 6 h | 0.092151 | 0.055814 | 0.002496 | 0.094594 | 0.059489 | 0.002660 | 0.002443 |
| 12 h | 0.118358 | 0.074072 | 0.003313 | 0.160712 | 0.106870 | 0.004779 | 0.042354 | |
| 1 day (24 h) | 0.146578 | 0.076799 | 0.003435 | 0.232744 | 0.159137 | 0.007117 | 0.086166 | |
| 1 week (168 h) | 0.174324 | 0.064495 | 0.002884 | 0.294072 | 0.147849 | 0.006612 | 0.119748 | |
| SW_IN | 6 h | 0.255795 | 0.164694 | 0.007365 | 0.331166 | 0.270628 | 0.012103 | 0.075371 |
| 12 h | 0.314755 | 0.164839 | 0.007372 | 0.518890 | 0.319771 | 0.014301 | 0.204135 | |
| 1 day (24 h) | 0.339310 | 0.128516 | 0.005747 | 0.608981 | 0.277150 | 0.012395 | 0.269671 | |
| 1 week (168 h) | 0.357922 | 0.103167 | 0.004614 | 0.693166 | 0.248412 | 0.011109 | 0.335244 | |
| LW_IN | 6 h | 0.299442 | 0.201432 | 0.009008 | 0.330486 | 0.185322 | 0.008288 | 0.031044 |
| 12 h | 0.367537 | 0.197000 | 0.008810 | 0.485509 | 0.193452 | 0.008651 | 0.117972 | |
| 1 day (24 h) | 0.400412 | 0.175611 | 0.007854 | 0.561446 | 0.186706 | 0.008350 | 0.161035 | |
| 1 week (168 h) | 0.466036 | 0.141424 | 0.006325 | 0.666359 | 0.161485 | 0.007222 | 0.200323 | |
| VPD | 6 h | 0.137863 | 0.104531 | 0.004675 | 0.178626 | 0.139476 | 0.006238 | 0.040762 |
| 12 h | 0.226616 | 0.183575 | 0.008210 | 0.320797 | 0.262730 | 0.011750 | 0.094181 | |
| 1 day (24 h) | 0.272840 | 0.206783 | 0.009248 | 0.407586 | 0.296989 | 0.013282 | 0.134746 | |
| 1 week (168 h) | 0.325113 | 0.154287 | 0.006900 | 0.499408 | 0.252086 | 0.011274 | 0.174294 | |
| WS | 6 h | 0.571539 | 0.523304 | 0.023403 | 0.429625 | 0.262009 | 0.011717 | -0.141914 |
| 12 h | 0.660302 | 0.408751 | 0.018280 | 0.680223 | 0.391765 | 0.017520 | 0.019921 | |
| 1 day (24 h) | 0.676472 | 0.374177 | 0.016734 | 0.767313 | 0.430367 | 0.019247 | 0.090841 | |
| 1 week (168 h) | 0.781636 | 0.264823 | 0.011843 | 0.888977 | 0.318094 | 0.014226 | 0.107340 | |
| PA | 6 h | 0.099325 | 0.066109 | 0.002956 | 0.144519 | 0.105120 | 0.004701 | 0.045194 |
| 12 h | 0.116360 | 0.061026 | 0.002729 | 0.338753 | 0.209793 | 0.009382 | 0.222393 | |
| 1 day (24 h) | 0.136620 | 0.076087 | 0.003403 | 0.562996 | 0.378688 | 0.016935 | 0.426376 | |
| 1 week (168 h) | 0.158988 | 0.068315 | 0.003055 | 0.879748 | 0.439831 | 0.019670 | 0.720760 | |
| P | 6 h | 0.542198 | 0.990843 | 0.044312 | 0.507613 | 0.973715 | 0.043546 | -0.034584 |
| 12 h | 0.627167 | 1.034936 | 0.046284 | 0.605429 | 1.025064 | 0.045842 | -0.021738 | |
| 1 day (24 h) | 0.628179 | 0.575428 | 0.025734 | 0.614481 | 0.583966 | 0.026116 | -0.013698 | |
| 1 week (168 h) | 0.880507 | 0.572737 | 0.025614 | 0.860985 | 0.597007 | 0.026699 | -0.019522 | |
| SWC | 6 h | 0.152144 | 0.107350 | 0.004801 | 0.151053 | 0.141370 | 0.006322 | -0.001092 |
| 12 h | 0.244768 | 0.154341 | 0.006902 | 0.305445 | 0.233699 | 0.010451 | 0.060678 | |
| 1 day (24 h) | 0.392553 | 0.222757 | 0.009962 | 0.538003 | 0.337738 | 0.015104 | 0.145450 | |
| 1 week (168 h) | 0.719138 | 0.314318 | 0.014057 | 0.842404 | 0.431371 | 0.019292 | 0.123267 | |
| TS | 6 h | 0.168111 | 0.114636 | 0.005127 | 0.118928 | 0.076226 | 0.003409 | -0.049183 |
| 12 h | 0.226302 | 0.134730 | 0.006025 | 0.203284 | 0.132946 | 0.005946 | -0.023019 | |
| 1 day (24 h) | 0.286258 | 0.152198 | 0.006807 | 0.285367 | 0.174358 | 0.007798 | -0.000891 | |
| 1 week (168 h) | 0.362279 | 0.176402 | 0.007889 | 0.359044 | 0.191313 | 0.008556 | -0.003235 | |
Gap in Other variables
models_gap_single = pd.DataFrame.from_records([
{'Gap': 'All variables', 'gap_single_var': False, 'model': l_model("all_varying_gap_varying_len_6-30_v3.pickle")},
{'Gap': 'Only one var', 'gap_single_var': True, 'model': l_model("all_varying_gap_varying_len_6-30_v3.pickle")},
])@cache_disk("gap_single")
def get_gap_single(n_rep):
kcomp_single = KalmanImpComparison(models_gap_single, hai, hai_era, block_len=130)
return kcomp_single.compare(n_rep =n_rep, gap_len = [6, 12, 24, 30], var = list(hai.columns))res_single = get_gap_single(n_rep)p = plot_compare(res_single, "Gap", y = 'rmse', scale_domain=["Only one var", "All variables"])
save_show_plot(p, "gap_single_var")
t = table_compare(res_single, 'Gap')
table_compare_latex(t, base_path_tbl / "gap_single_var.tex", caption="\\CapGapSingle", label="tbl:gap_single_var")
t| Gap | Only one var | All variables | ||||||
|---|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | se | mean | std | se | diff. | |
| Variable | Gap | |||||||
| TA | 3 h | 0.029116 | 0.026433 | 0.001182 | 0.048143 | 0.040335 | 0.001804 | 0.019027 |
| 6 h | 0.042861 | 0.031811 | 0.001423 | 0.080906 | 0.062887 | 0.002812 | 0.038045 | |
| 12 h | 0.071487 | 0.049631 | 0.002220 | 0.124425 | 0.096532 | 0.004317 | 0.052939 | |
| 15 h | 0.079382 | 0.053998 | 0.002415 | 0.135941 | 0.087736 | 0.003924 | 0.056559 | |
| SW_IN | 3 h | 0.198721 | 0.204619 | 0.009151 | 0.196330 | 0.239872 | 0.010727 | -0.002390 |
| 6 h | 0.230639 | 0.190542 | 0.008521 | 0.247159 | 0.234288 | 0.010478 | 0.016520 | |
| 12 h | 0.268135 | 0.173156 | 0.007744 | 0.279729 | 0.220169 | 0.009846 | 0.011595 | |
| 15 h | 0.254110 | 0.137783 | 0.006162 | 0.275088 | 0.184996 | 0.008273 | 0.020978 | |
| LW_IN | 3 h | 0.198619 | 0.171501 | 0.007670 | 0.204255 | 0.192592 | 0.008613 | 0.005636 |
| 6 h | 0.288557 | 0.205514 | 0.009191 | 0.285575 | 0.220568 | 0.009864 | -0.002982 | |
| 12 h | 0.336802 | 0.211468 | 0.009457 | 0.346164 | 0.234498 | 0.010487 | 0.009362 | |
| 15 h | 0.339017 | 0.183203 | 0.008193 | 0.343338 | 0.211743 | 0.009469 | 0.004321 | |
| VPD | 3 h | 0.063849 | 0.064851 | 0.002900 | 0.096974 | 0.101109 | 0.004522 | 0.033125 |
| 6 h | 0.096670 | 0.092415 | 0.004133 | 0.154050 | 0.152273 | 0.006810 | 0.057380 | |
| 12 h | 0.163176 | 0.134966 | 0.006036 | 0.235416 | 0.209535 | 0.009371 | 0.072241 | |
| 15 h | 0.170890 | 0.147536 | 0.006598 | 0.257427 | 0.230171 | 0.010294 | 0.086537 | |
| WS | 3 h | 0.299512 | 0.185594 | 0.008300 | 0.300374 | 0.185026 | 0.008275 | 0.000862 |
| 6 h | 0.394661 | 0.234428 | 0.010484 | 0.396053 | 0.237782 | 0.010634 | 0.001392 | |
| 12 h | 0.479464 | 0.249523 | 0.011159 | 0.499147 | 0.264118 | 0.011812 | 0.019682 | |
| 15 h | 0.470032 | 0.218304 | 0.009763 | 0.481117 | 0.234884 | 0.010504 | 0.011084 | |
| PA | 3 h | 0.024250 | 0.015962 | 0.000714 | 0.028151 | 0.017252 | 0.000772 | 0.003902 |
| 6 h | 0.036931 | 0.024932 | 0.001115 | 0.042390 | 0.031342 | 0.001402 | 0.005459 | |
| 12 h | 0.050980 | 0.025147 | 0.001125 | 0.055303 | 0.026073 | 0.001166 | 0.004323 | |
| 15 h | 0.054699 | 0.037828 | 0.001692 | 0.057890 | 0.042331 | 0.001893 | 0.003191 | |
| P | 3 h | 0.338024 | 0.665541 | 0.029764 | 0.322645 | 0.525352 | 0.023494 | -0.015379 |
| 6 h | 0.380737 | 0.804932 | 0.035998 | 0.405015 | 0.923454 | 0.041298 | 0.024277 | |
| 12 h | 0.428597 | 0.572176 | 0.025588 | 0.455575 | 0.680281 | 0.030423 | 0.026978 | |
| 15 h | 0.484958 | 1.048699 | 0.046899 | 0.507848 | 1.212676 | 0.054232 | 0.022889 | |
| SWC | 3 h | 0.012567 | 0.019060 | 0.000852 | 0.013736 | 0.023656 | 0.001058 | 0.001169 |
| 6 h | 0.017016 | 0.026366 | 0.001179 | 0.021119 | 0.033282 | 0.001488 | 0.004103 | |
| 12 h | 0.029611 | 0.049173 | 0.002199 | 0.033421 | 0.048640 | 0.002175 | 0.003810 | |
| 15 h | 0.031970 | 0.045392 | 0.002030 | 0.039874 | 0.055206 | 0.002469 | 0.007904 | |
| TS | 3 h | 0.028479 | 0.024754 | 0.001107 | 0.032860 | 0.028891 | 0.001292 | 0.004381 |
| 6 h | 0.045677 | 0.047974 | 0.002145 | 0.061144 | 0.057966 | 0.002592 | 0.015467 | |
| 12 h | 0.088208 | 0.091400 | 0.004088 | 0.114275 | 0.118003 | 0.005277 | 0.026068 | |
| 15 h | 0.101862 | 0.103646 | 0.004635 | 0.131989 | 0.130062 | 0.005817 | 0.030128 | |
res_singl_perc = res_single.groupby(['Gap', 'var', 'gap_len']).agg({'rmse_stand': 'mean'}).reset_index().pivot(columns = 'Gap', values='rmse_stand', index=['var', 'gap_len'])pd.DataFrame({'Only one var': (res_singl_perc["All variables"] - res_singl_perc["Only one var"]) / res_singl_perc["All variables"] * 100})| Only one var | ||
|---|---|---|
| var | gap_len | |
| TA | 3 | 39.521284 |
| 6 | 47.023566 | |
| 12 | 42.546605 | |
| 15 | 41.605866 | |
| SW_IN | 3 | -1.217559 |
| 6 | 6.684132 | |
| 12 | 4.144939 | |
| 15 | 7.625968 | |
| LW_IN | 3 | 2.759433 |
| 6 | -1.044157 | |
| 12 | 2.704600 | |
| 15 | 1.258656 | |
| VPD | 3 | 34.158966 |
| 6 | 37.247869 | |
| 12 | 30.686333 | |
| 15 | 33.616012 | |
| WS | 3 | 0.286928 |
| 6 | 0.351345 | |
| 12 | 3.943177 | |
| 15 | 2.303891 | |
| PA | 3 | 13.860326 |
| 6 | 12.877279 | |
| 12 | 7.816283 | |
| 15 | 5.511460 | |
| P | 3 | -4.766508 |
| 6 | 5.994219 | |
| 12 | 5.921732 | |
| 15 | 4.507125 | |
| SWC | 3 | 8.507126 |
| 6 | 19.428107 | |
| 12 | 11.400826 | |
| 15 | 19.823475 | |
| TS | 3 | 13.331333 |
| 6 | 25.295932 | |
| 12 | 22.811204 | |
| 15 | 22.825776 |
res_singl_perc = res_single.groupby(['Gap', 'var']).agg({'rmse_stand': 'mean'}).reset_index().pivot(columns = 'Gap', values='rmse_stand', index=['var'])pd.DataFrame({'Only one var': (res_singl_perc["All variables"] - res_singl_perc["Only one var"]) / res_singl_perc["All variables"] * 100})| Only one var | |
|---|---|
| var | |
| TA | 42.774328 |
| SW_IN | 4.678197 |
| LW_IN | 1.385379 |
| VPD | 33.511756 |
| WS | 1.969357 |
| PA | 9.183783 |
| P | 3.475042 |
| SWC | 15.706237 |
| TS | 22.347878 |
Generic vs Specialized
models_generic = models_var.copy()models_generic.model = l_model("1_gap_varying_6-336_v3.pickle")
models_generic['type'] = 'Generic'models_generic| var | model | type | |
|---|---|---|---|
| 0 | TA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 1 | SW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 2 | LW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 3 | VPD | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 4 | WS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 5 | PA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 6 | P | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 7 | TS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 8 | SWC | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
models_var['type'] = 'Fine-tuned one var'models_gen_vs_spec = pd.concat([models_generic, models_var])models_gen_vs_spec| var | model | type | |
|---|---|---|---|
| 0 | TA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 1 | SW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 2 | LW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 3 | VPD | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 4 | WS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 5 | PA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 6 | P | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 7 | TS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 8 | SWC | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 0 | TA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 1 | SW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 2 | LW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 3 | VPD | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 4 | WS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 5 | PA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 6 | P | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 7 | TS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 8 | SWC | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
@cache_disk("generic")
def get_generic(n_rep=n_rep):
reset_seed()
comp_generic = KalmanImpComparison(models_gen_vs_spec, hai, hai_era, block_len=100+48*7)
return comp_generic.compare(n_rep =n_rep, gap_len = [12, 24, 48, 48*7], var = list(hai.columns))
k_results_generic = get_generic(n_rep)plot_formatter.legend_symbol_size = 300p = plot_compare(k_results_generic, 'type', y = 'rmse', scale_domain=["Fine-tuned one var", "Generic"])
save_show_plot(p, "generic")
pt = table_compare(k_results_generic, 'type')
table_compare_latex(t, base_path_tbl / "generic.tex", label='tbl:generic', caption="\\CapGeneric")
t| type | Generic | Fine-tuned one var | ||||||
|---|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | se | mean | std | se | diff. | |
| Variable | Gap | |||||||
| TA | 6 h | 0.092151 | 0.055814 | 0.002496 | 0.054999 | 0.032663 | 0.001461 | -0.037153 |
| 12 h | 0.118358 | 0.074072 | 0.003313 | 0.077345 | 0.052042 | 0.002327 | -0.041013 | |
| 1 day (24 h) | 0.146578 | 0.076799 | 0.003435 | 0.100970 | 0.058986 | 0.002638 | -0.045608 | |
| 1 week (168 h) | 0.174324 | 0.064495 | 0.002884 | 0.128855 | 0.048376 | 0.002163 | -0.045469 | |
| SW_IN | 6 h | 0.255795 | 0.164694 | 0.007365 | 0.208728 | 0.167255 | 0.007480 | -0.047067 |
| 12 h | 0.314755 | 0.164839 | 0.007372 | 0.257761 | 0.163914 | 0.007330 | -0.056994 | |
| 1 day (24 h) | 0.339310 | 0.128516 | 0.005747 | 0.280578 | 0.131607 | 0.005886 | -0.058733 | |
| 1 week (168 h) | 0.357922 | 0.103167 | 0.004614 | 0.297530 | 0.118822 | 0.005314 | -0.060391 | |
| LW_IN | 6 h | 0.299442 | 0.201432 | 0.009008 | 0.256844 | 0.199753 | 0.008933 | -0.042598 |
| 12 h | 0.367537 | 0.197000 | 0.008810 | 0.326104 | 0.209362 | 0.009363 | -0.041433 | |
| 1 day (24 h) | 0.400412 | 0.175611 | 0.007854 | 0.358755 | 0.185939 | 0.008315 | -0.041657 | |
| 1 week (168 h) | 0.466036 | 0.141424 | 0.006325 | 0.405833 | 0.165308 | 0.007393 | -0.060203 | |
| VPD | 6 h | 0.137863 | 0.104531 | 0.004675 | 0.096682 | 0.089465 | 0.004001 | -0.041182 |
| 12 h | 0.226616 | 0.183575 | 0.008210 | 0.169084 | 0.164870 | 0.007373 | -0.057532 | |
| 1 day (24 h) | 0.272840 | 0.206783 | 0.009248 | 0.202490 | 0.159921 | 0.007152 | -0.070350 | |
| 1 week (168 h) | 0.325113 | 0.154287 | 0.006900 | 0.263913 | 0.153488 | 0.006864 | -0.061201 | |
| WS | 6 h | 0.571539 | 0.523304 | 0.023403 | 0.365262 | 0.200693 | 0.008975 | -0.206276 |
| 12 h | 0.660302 | 0.408751 | 0.018280 | 0.481919 | 0.265006 | 0.011851 | -0.178383 | |
| 1 day (24 h) | 0.676472 | 0.374177 | 0.016734 | 0.510818 | 0.270480 | 0.012096 | -0.165653 | |
| 1 week (168 h) | 0.781636 | 0.264823 | 0.011843 | 0.569137 | 0.189984 | 0.008496 | -0.212499 | |
| PA | 6 h | 0.099325 | 0.066109 | 0.002956 | 0.054017 | 0.032223 | 0.001441 | -0.045308 |
| 12 h | 0.116360 | 0.061026 | 0.002729 | 0.059966 | 0.030824 | 0.001378 | -0.056394 | |
| 1 day (24 h) | 0.136620 | 0.076087 | 0.003403 | 0.072074 | 0.068788 | 0.003076 | -0.064546 | |
| 1 week (168 h) | 0.158988 | 0.068315 | 0.003055 | 0.081116 | 0.053864 | 0.002409 | -0.077872 | |
| P | 6 h | 0.542198 | 0.990843 | 0.044312 | 0.542198 | 0.990843 | 0.044312 | 0.000000 |
| 12 h | 0.627167 | 1.034936 | 0.046284 | 0.627167 | 1.034936 | 0.046284 | 0.000000 | |
| 1 day (24 h) | 0.628179 | 0.575428 | 0.025734 | 0.628179 | 0.575428 | 0.025734 | 0.000000 | |
| 1 week (168 h) | 0.880507 | 0.572737 | 0.025614 | 0.880507 | 0.572737 | 0.025614 | 0.000000 | |
| SWC | 6 h | 0.152144 | 0.107350 | 0.004801 | 0.050597 | 0.033592 | 0.001502 | -0.101547 |
| 12 h | 0.244768 | 0.154341 | 0.006902 | 0.067366 | 0.043308 | 0.001937 | -0.177402 | |
| 1 day (24 h) | 0.392553 | 0.222757 | 0.009962 | 0.078784 | 0.053435 | 0.002390 | -0.313769 | |
| 1 week (168 h) | 0.719138 | 0.314318 | 0.014057 | 0.169313 | 0.103684 | 0.004637 | -0.549825 | |
| TS | 6 h | 0.168111 | 0.114636 | 0.005127 | 0.065352 | 0.068001 | 0.003041 | -0.102759 |
| 12 h | 0.226302 | 0.134730 | 0.006025 | 0.088370 | 0.087491 | 0.003913 | -0.137932 | |
| 1 day (24 h) | 0.286258 | 0.152198 | 0.006807 | 0.129457 | 0.127674 | 0.005710 | -0.156801 | |
| 1 week (168 h) | 0.362279 | 0.176402 | 0.007889 | 0.316745 | 0.217719 | 0.009737 | -0.045534 | |
res_singl_perc = k_results_generic.groupby(['type', 'var']).agg({'rmse_stand': 'mean'}).reset_index().pivot(columns = 'type', values='rmse_stand', index=['var'])(res_singl_perc["Generic"] - res_singl_perc["Fine-tuned one var"]) / res_singl_perc["Generic"] * 100var
TA 31.847749
SW_IN 17.604408
LW_IN 12.122588
VPD 23.925198
WS 28.357855
PA 47.745394
P 0.000000
SWC 75.735168
TS 42.478176
dtype: float64
Training
models_train = pd.DataFrame.from_records([
# {'Train': 'All variables', 'model': l_model("All_gap_all_30_v1.pickle") },
{'Train': 'Only one var', 'model': l_model("1_gap_varying_6-336_v3.pickle") },
{'Train': 'Multi vars', 'model': l_model("all_varying_gap_varying_len_6-30_v3.pickle") },
{'Train': 'Random params', 'model': l_model("rand_all_varying_gap_varying_len_6-30_v4.pickle") }
])models_train| Train | model | |
|---|---|---|
| 0 | Only one var | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 |
| 1 | Multi vars | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 |
| 2 | Random params | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 |
@cache_disk("train")
def get_train(n_rep):
reset_seed()
kcomp = KalmanImpComparison(models_train, hai, hai_era, block_len=130)
return kcomp.compare(n_rep =n_rep, gap_len = [6, 12, 24, 30], var = list(hai.columns))res_train = get_train(n_rep)res_train_agg = res_train.groupby(['Train', 'gap_len']).agg({'rmse_stand': 'mean'}).reset_index()res_train_agg| Train | gap_len | rmse_stand | |
|---|---|---|---|
| 0 | Multi vars | 3 | 0.131760 |
| 1 | Multi vars | 6 | 0.166202 |
| 2 | Multi vars | 12 | 0.215978 |
| 3 | Multi vars | 15 | 0.225579 |
| 4 | Only one var | 3 | 0.192683 |
| 5 | Only one var | 6 | 0.250933 |
| 6 | Only one var | 12 | 0.332169 |
| 7 | Only one var | 15 | 0.340622 |
| 8 | Random params | 3 | 0.200750 |
| 9 | Random params | 6 | 0.236855 |
| 10 | Random params | 12 | 0.292697 |
| 11 | Random params | 15 | 0.294936 |
p = plot_compare(res_train, "Train", y='rmse', scale_domain=["Multi vars", "Only one var", "Random params"])
save_show_plot(p, "train_compare")
t = table_compare3(res_train, 'Train')
table_compare3_latex(t, base_path_tbl / "train.tex", label="tbl:train_compare", caption="\\CapTrain")
t| Train | Multi vars | Only one var | Random params | ||||
|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | mean | std | mean | std | |
| Variable | Gap [$h$] | ||||||
| TA | 3 h | 0.028752 | 0.023775 | 0.066495 | 0.049728 | 0.060459 | 0.045142 |
| 6 h | 0.044355 | 0.032665 | 0.092593 | 0.062233 | 0.079903 | 0.055433 | |
| 12 h | 0.072632 | 0.049516 | 0.133494 | 0.091525 | 0.117427 | 0.085482 | |
| 15 h | 0.077183 | 0.051459 | 0.130566 | 0.071501 | 0.111204 | 0.067554 | |
| SW_IN | 3 h | 0.178118 | 0.175674 | 0.201927 | 0.178423 | 0.259459 | 0.219808 |
| 6 h | 0.210377 | 0.170297 | 0.249492 | 0.175702 | 0.282919 | 0.209136 | |
| 12 h | 0.268875 | 0.172214 | 0.307884 | 0.167333 | 0.333024 | 0.192553 | |
| 15 h | 0.269610 | 0.165685 | 0.315006 | 0.164416 | 0.333722 | 0.187786 | |
| LW_IN | 3 h | 0.200445 | 0.169263 | 0.215081 | 0.177105 | 0.298262 | 0.202726 |
| 6 h | 0.269851 | 0.215872 | 0.296780 | 0.229277 | 0.341691 | 0.228729 | |
| 12 h | 0.331508 | 0.224076 | 0.376745 | 0.241035 | 0.398407 | 0.259110 | |
| 15 h | 0.356784 | 0.208411 | 0.389550 | 0.207828 | 0.398491 | 0.224671 | |
| VPD | 3 h | 0.064145 | 0.060110 | 0.097167 | 0.079462 | 0.150077 | 0.127847 |
| 6 h | 0.098231 | 0.096103 | 0.147885 | 0.112805 | 0.193383 | 0.174094 | |
| 12 h | 0.168481 | 0.155672 | 0.222713 | 0.160915 | 0.240388 | 0.192226 | |
| 15 h | 0.185879 | 0.158725 | 0.241190 | 0.167722 | 0.254005 | 0.210471 | |
| WS | 3 h | 0.305451 | 0.182717 | 0.471322 | 0.459590 | 0.447337 | 0.299554 |
| 6 h | 0.371935 | 0.210461 | 0.575585 | 0.465884 | 0.481752 | 0.308642 | |
| 12 h | 0.429368 | 0.195014 | 0.649132 | 0.481338 | 0.523181 | 0.269521 | |
| 15 h | 0.478824 | 0.230152 | 0.690806 | 0.456658 | 0.540468 | 0.292931 | |
| PA | 3 h | 0.024824 | 0.018937 | 0.073198 | 0.061610 | 0.067156 | 0.056764 |
| 6 h | 0.035399 | 0.020712 | 0.094714 | 0.061088 | 0.097657 | 0.076961 | |
| 12 h | 0.051464 | 0.045255 | 0.117277 | 0.073159 | 0.118236 | 0.089099 | |
| 15 h | 0.058828 | 0.039719 | 0.127520 | 0.079590 | 0.114797 | 0.076361 | |
| P | 3 h | 0.348034 | 0.680361 | 0.411050 | 1.509773 | 0.370699 | 0.713138 |
| 6 h | 0.399553 | 0.617684 | 0.486592 | 0.718194 | 0.425397 | 0.704196 | |
| 12 h | 0.511262 | 0.851207 | 0.706403 | 1.150406 | 0.535496 | 0.980554 | |
| 15 h | 0.453258 | 0.597555 | 0.618186 | 0.755660 | 0.469612 | 0.698131 | |
| SWC | 3 h | 0.012302 | 0.029214 | 0.095440 | 0.075541 | 0.106073 | 0.074970 |
| 6 h | 0.018203 | 0.030712 | 0.150918 | 0.108401 | 0.153282 | 0.106928 | |
| 12 h | 0.027391 | 0.036781 | 0.245083 | 0.160379 | 0.246170 | 0.215060 | |
| 15 h | 0.033106 | 0.041433 | 0.282246 | 0.227894 | 0.278409 | 0.208012 | |
| TS | 3 h | 0.023765 | 0.017708 | 0.102470 | 0.082133 | 0.047228 | 0.037464 |
| 6 h | 0.047910 | 0.048379 | 0.163837 | 0.105534 | 0.075711 | 0.073902 | |
| 12 h | 0.082818 | 0.086185 | 0.230788 | 0.157936 | 0.121944 | 0.134360 | |
| 15 h | 0.116737 | 0.128579 | 0.270524 | 0.190755 | 0.153716 | 0.160674 | |
Extra results
Standard deviations
hai_std = hai.std().to_frame(name='std')
hai_std.index.name = "Variable"
hai_std = hai_std.reset_index().assign(unit=[f"\\si{{{unit}}}" for unit in units_big.values()])hai_std| Variable | std | unit | |
|---|---|---|---|
| 0 | TA | 7.924611 | \si{°C} |
| 1 | SW_IN | 204.002561 | \si{W m-2} |
| 2 | LW_IN | 41.955741 | \si{hPa} |
| 3 | VPD | 4.368417 | \si{hPa} |
| 4 | WS | 1.625425 | \si{mm} |
| 5 | PA | 0.855159 | \si{m s-1} |
| 6 | P | 0.280276 | \si{W m-2} |
| 7 | SWC | 8.913119 | \si{%} |
| 8 | TS | 5.658643 | \si{°C} |
latex = hai_std.style.hide(axis="index").format(precision=3).to_latex(hrules=True, caption="\\CapStd", label="tbl:hai_std", position_float="centering")
with open(base_path_tbl / "hai_std.tex", 'w') as f:
f.write(latex)Gap distribution
out_dir = here("../fluxnet/gap_stat")site_info = pl.read_parquet(out_dir / "../site_info.parquet").select([
pl.col("start").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M"),
pl.col("end").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M"),
pl.col("site").cast(pl.Categorical).sort()
])def duration_n_obs(duration):
"converts a duration into a n of fluxnet observations"
return abs(int(duration.total_seconds() / (30 * 60)))files = out_dir.ls()
files.sort() # need to sort to match the site_info
sites = []
for i, path in enumerate(files):
sites.append(pl.scan_parquet(path).with_columns([
pl.lit(site_info[i, "site"]).alias("site"),
pl.lit(duration_n_obs(site_info[i, "start"] - site_info[i, "end"])).alias("total_obs"),
pl.col("TIMESTAMP_END").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M").alias("end"),
]).drop("TIMESTAMP_END"))
gap_stat = pl.concat(sites)pl.read_parquet(files[0])
shape: (31574, 3)
TIMESTAMP_END
gap_len
variable
i64
u32
str
200901010030
16992
"TA_F_MDS_QC"
200912211100
5
"TA_F_MDS_QC"
200912211700
1
"TA_F_MDS_QC"
201001061300
1
"TA_F_MDS_QC"
201001071300
3
"TA_F_MDS_QC"
201001081130
2
"TA_F_MDS_QC"
201001081830
1
"TA_F_MDS_QC"
201001091000
2
"TA_F_MDS_QC"
201001191000
1
"TA_F_MDS_QC"
201001191130
2
"TA_F_MDS_QC"
201001291900
1
"TA_F_MDS_QC"
201002131030
2
"TA_F_MDS_QC"
...
...
...
201103241630
3
"NEE_VUT_95_QC"
201103241930
29
"NEE_VUT_95_QC"
201103251900
8
"NEE_VUT_95_QC"
201103260030
1
"NEE_VUT_95_QC"
201103260200
1
"NEE_VUT_95_QC"
201103260300
1
"NEE_VUT_95_QC"
201103261230
1
"NEE_VUT_95_QC"
201103261330
2
"NEE_VUT_95_QC"
201103261630
1
"NEE_VUT_95_QC"
201103261930
2
"NEE_VUT_95_QC"
201103262100
1
"NEE_VUT_95_QC"
201103270000
13441
"NEE_VUT_95_QC"
gap_stat.head().collect()
shape: (5, 5)
gap_len
variable
site
total_obs
end
u32
str
str
i32
datetime[μs]
16992
"TA_F_MDS_QC"
"AR-SLu"
52559
2009-01-01 00:30:00
5
"TA_F_MDS_QC"
"AR-SLu"
52559
2009-12-21 11:00:00
1
"TA_F_MDS_QC"
"AR-SLu"
52559
2009-12-21 17:00:00
1
"TA_F_MDS_QC"
"AR-SLu"
52559
2010-01-06 13:00:00
3
"TA_F_MDS_QC"
"AR-SLu"
52559
2010-01-07 13:00:00
def plot_var_dist(var, small=False, ax=None):
if ax is None: ax = get_grid(1)[0]
ta_gaps = gap_stat.filter(
(pl.col("variable") == var)
).filter(
pl.col("gap_len") < 200 if small else True
).with_column(pl.col("gap_len") / (24 *2 * 7)).collect().to_pandas().hist("gap_len", bins=50, ax=ax)
ax.set_title(f"{var} - { 'gaps < 200' if small else 'all gaps'}")
if not small: ax.set_yscale('log')
ax.set_xlabel("gap length (weeks)")
ax.set_ylabel(f"{'Log' if not small else ''} n gaps")
# plt.xscale('log') plot_var_dist('TA_F_QC')
color_map = dict(zip(scale_meteo.domain, list(sns.color_palette('Set2', n_colors=len(hai.columns)).as_hex())))qc_map = {
'TA': 'TA_F_QC',
'SW_IN': 'SW_IN_F_QC',
'LW_IN': 'LW_IN_F_QC',
'VPD': 'VPD_F_QC',
'WS': 'WS_F_QC',
'PA': 'PA_F_QC',
'P': 'P_F_QC',
'TS': 'TS_F_MDS_1_QC',
'SWC': 'SWC_F_MDS_1_QC',
}def pl_in(col, values):
expr = False
for val in values:
expr |= pl.col(col) == val
return exprgap_stat.filter(pl_in('variable', qc_map.values())
).with_columns([
pl.when(pl.col("gap_len") < 48*7).then(True).otherwise(False).alias("short"),
pl.count().alias("total"),
pl.count().alias("total len"),
]).groupby("short").agg([
(pl.col("gap_len").count() / pl.col("total")).alias("frac_num"),
(pl.col("gap_len").sum() / pl.col("total len")).alias("frac_len")
]).collect()
shape: (2, 3)
short
frac_num
frac_len
bool
list[f64]
list[f64]
false
[0.010775, 0.010775, ... 0.010775]
[63.844386, 63.844386, ... 63.844386]
true
[0.989225, 0.989225, ... 0.989225]
[7.261655, 7.261655, ... 7.261655]
gap_stat.filter(pl_in('variable', qc_map.values())
).with_columns([
pl.when(pl.col("gap_len") < 48*7).then(True).otherwise(False).alias("short"),
pl.count().alias("total"),
pl.count().alias("total len"),
]).groupby("short").agg([
(pl.col("gap_len").count() / pl.col("total")).alias("frac_num"),
(pl.col("gap_len").sum() / pl.col("total len")).alias("frac_len")
]).collect()
shape: (2, 3)
short
frac_num
frac_len
bool
list[f64]
list[f64]
false
[0.010775, 0.010775, ... 0.010775]
[63.844386, 63.844386, ... 63.844386]
true
[0.989225, 0.989225, ... 0.989225]
[7.261655, 7.261655, ... 7.261655]
frac_miss = gap_stat.filter(
pl_in('variable', qc_map.values())
).groupby(["site", "variable"]).agg([
pl.col("gap_len").mean().alias("mean"),
(pl.col("gap_len").sum() / pl.col("total_obs").first()).alias("frac_gap")
])frac_miss.groupby('variable').agg([
pl.col("frac_gap").max().alias("max"),
pl.col("frac_gap").min().alias("min"),
pl.col("frac_gap").std().alias("std"),
pl.col("frac_gap").mean().alias("mean"),
]).collect()
shape: (9, 5)
variable
max
min
std
mean
str
f64
f64
f64
f64
"SW_IN_F_QC"
2.193984
0.000038
0.21952
0.16772
"PA_F_QC"
2.675381
0.000011
0.420542
0.387658
"VPD_F_QC"
2.298799
0.000011
0.297668
0.245457
"SWC_F_MDS_1_QC...
1.487143
0.000011
0.29985
0.314847
"TS_F_MDS_1_QC"
3.090642
0.000011
0.38542
0.282106
"WS_F_QC"
2.675324
0.000798
0.323944
0.257303
"TA_F_QC"
2.299027
0.000011
0.256043
0.198846
"P_F_QC"
1.503296
0.000011
0.330422
0.311598
"LW_IN_F_QC"
7.074603
0.000019
0.704469
0.578246
frac_miss.sort("frac_gap", reverse=True).collect()
shape: (1798, 4)
site
variable
mean
frac_gap
str
str
f64
f64
"US-LWW"
"LW_IN_F_QC"
11267.590909
7.074603
"US-Wi5"
"LW_IN_F_QC"
70128.0
3.992031
"ES-LgS"
"LW_IN_F_QC"
175344.0
3.333093
"US-LWW"
"TS_F_MDS_1_QC"
1320.646341
3.090642
"US-Wi5"
"TS_F_MDS_1_QC"
62.142857
2.897194
"US-Wi0"
"LW_IN_F_QC"
1369.694444
2.814601
"US-ORv"
"PA_F_QC"
3.899334
2.675381
"US-ORv"
"WS_F_QC"
3.899251
2.675324
"US-LWW"
"PA_F_QC"
409.701754
2.665944
"US-Wi5"
"WS_F_QC"
39.57814
2.349861
"US-Wi5"
"PA_F_QC"
51.453972
2.322707
"US-Wi5"
"TA_F_QC"
45.790249
2.299027
...
...
...
...
"CN-Ha2"
"TS_F_MDS_1_QC"
1.0
0.000019
"CN-Qia"
"TS_F_MDS_1_QC"
1.0
0.000019
"CN-Qia"
"SWC_F_MDS_1_QC...
1.0
0.000019
"FI-Lom"
"P_F_QC"
1.0
0.000019
"CN-Ha2"
"TA_F_QC"
1.0
0.000019
"CN-Qia"
"LW_IN_F_QC"
1.0
0.000019
"US-Me6"
"TS_F_MDS_1_QC"
1.0
0.000011
"US-Me6"
"PA_F_QC"
1.0
0.000011
"US-Me6"
"SWC_F_MDS_1_QC...
1.0
0.000011
"US-Me6"
"TA_F_QC"
1.0
0.000011
"US-Me6"
"P_F_QC"
1.0
0.000011
"US-Me6"
"VPD_F_QC"
1.0
0.000011
site_info.filter((pl.col("site") == "US-LWW"))
shape: (1, 3)
start
end
site
datetime[μs]
datetime[μs]
cat
1997-01-01 00:30:00
1999-01-01 00:00:00
"US-LWW"
gap_stat.filter((pl.col("site") == "US-LWW") & (pl.col("variable") == "LW_IN_F_QC" )).collect()
shape: (22, 5)
gap_len
variable
site
total_obs
end
u32
str
str
i32
datetime[μs]
246556
"LW_IN_F_QC"
"US-LWW"
35039
2000-01-01 00:30:00
19
"LW_IN_F_QC"
"US-LWW"
35039
2014-02-11 08:30:00
6
"LW_IN_F_QC"
"US-LWW"
35039
2014-03-08 08:00:00
1
"LW_IN_F_QC"
"US-LWW"
35039
2014-03-08 11:30:00
1
"LW_IN_F_QC"
"US-LWW"
35039
2014-03-08 13:00:00
50
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-18 13:00:00
21
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-19 22:30:00
22
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-24 23:30:00
55
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-26 06:30:00
172
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-27 19:30:00
70
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-01 23:00:00
3
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-03 11:30:00
35
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-03 16:30:00
92
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-04 11:00:00
14
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-11 05:00:00
36
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-11 18:00:00
36
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-12 17:30:00
87
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-13 18:00:00
70
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-16 01:30:00
37
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-17 16:00:00
10
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-21 07:30:00
494
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-21 17:30:00
import matplotlibmatplotlib.rcParams.update({'font.size': 22})
sns.set_style("whitegrid")def plot_var_dist(var, ax=None, small=True):
if ax is None: ax = get_grid(1)[0]
color = color_map[var]
var_qc = qc_map[var]
ta_gaps = gap_stat.filter(
(pl.col("variable") == var_qc)
).filter(
pl.col("gap_len") < (24 * 2 *7) if small else True
).with_column(pl.col("gap_len") / (2 if small else 48 * 7)
).collect().to_pandas().hist("gap_len", bins=50, ax=ax, edgecolor="white", color=color)
ax.set_title(f"{var} - { 'gap length < 1 week' if small else 'all gaps'}")
ax.set_xlabel(f"Gap length ({ 'hour' if small else 'week'})")
ax.set_ylabel(f"Log n gaps")
ax.set_yscale('log')
plt.tight_layout()vars = gap_stat.select(pl.col("variable").unique()).collect()vars.filter(pl.col("variable").str.contains("TA"))
shape: (4, 1)
variable
str
"NEE_VUT_USTAR5...
"TA_F_MDS_QC"
"NEE_CUT_USTAR5...
"TA_F_QC"
for ax, var in zip(get_grid(9,3,3, figsize=(15,12), sharey=False), list(var_type.categories)):
plot_var_dist(var, ax=ax)
plt.savefig(base_path_img / "gap_len_dist_small.pdf")
for ax, var in zip(get_grid(9,3,3, figsize=(15,12), sharey=False), list(var_type.categories)):
plot_var_dist(var, ax=ax, small=False)
plt.savefig(base_path_img / "gap_len_dist.pdf")
Square Root Filter
Numerical Stability
from meteo_imp.kalman.performance import *err = cache_disk(cache_dir / "fuzz_sr")(fuzz_filter_SR)(100, 110) # this is already handling the random seedp = plot_err_sr_filter(err)
save_show_plot(p, "numerical_stability")
Performance
@cache_disk(cache_dir / "perf_sr")
def get_perf_sr():
reset_seed()
return perf_comb_params('filter', use_sr_filter=[True, False], rep=range(100),
n_obs = 100,
n_dim_obs=9,
n_dim_state=18,
n_dim_contr=14,
p_missing=0,
bs=20 ,
init_method = 'local_slope'
) perf1 = (get_perf_sr()
.groupby('use_sr_filter')
.agg(pl.col("time").mean())
.with_column(
pl.when(pl.col("use_sr_filter"))
.then(pl.lit("Square Root Filter"))
.otherwise(pl.lit("Standard Filter"))
.alias("Filter type")
))perf1
shape: (2, 3)
use_sr_filter
time
Filter type
bool
f64
str
true
0.231654
"Square Root Fi...
false
0.150772
"Standard Filte...
(perf1[0, 'time'] - perf1[1, 'time']) / perf1[1, 'time'] * 10053.645095282102254
plot_perf_sr = alt.Chart(perf1.to_pandas()).mark_bar(size = 50).encode(
x=alt.X('Filter type', axis=alt.Axis(labelAngle=0)),
y=alt.Y('time', scale=alt.Scale(zero=False), title="time [s]"),
color=alt.Color('Filter type',
scale = alt.Scale(scheme = 'accent'))
).properties(width=300)save_show_plot(plot_perf_sr, "perf_sr")